library(dplyr); library(reshape2); library(ggplot2)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
#' Keep the New York City rows of a Zillow neighborhood table and reshape
#' them to long form with a proper Date column.
#'
#' @param dat Wide data frame with `State`, `City` and `RegionName` columns
#'   followed by one column per reporting period.
#' @param date_start Index of the first period column; every column from
#'   this position to the last one is melted.
#' @return Long data frame with columns `RegionName`, `variable` (the period
#'   label with its first character removed and ".01" appended), `value`
#'   and `date` (`variable` parsed as a Date).
filter_zillow_data <- function(dat, date_start=7){
  # Guard against date_start:ncol(dat) silently counting backwards.
  stopifnot(is.data.frame(dat),
            length(date_start) == 1,
            date_start >= 1,
            date_start <= ncol(dat))

  # Only New York. Namespaced so stats::filter can never be picked up.
  dat <- dplyr::filter(dat, State == "NY", City == "New York")

  # Prepare for plotting
  period_cols <- colnames(dat)[seq(date_start, ncol(dat))]
  dat_m <- reshape2::melt(dat, id.vars = "RegionName",
                          measure.vars = period_cols)

  # Make more date-ey: drop the leading character of the column name
  # (presumably the "X" read.csv prepends to names starting with a digit)
  # and paste on a day to make the date unambiguous. substr() and sprintf()
  # are vectorised, and sprintf() yields character(0) for empty input.
  period_label <- as.character(dat_m$variable)
  dat_m$variable <- sprintf("%s.01",
                            substr(period_label, 2, nchar(period_label)))
  dat_m$date <- as.Date(dat_m$variable, format = "%Y.%m.%d")
  dat_m
}
# Load the neighborhood median listing prices and reshape the New York
# rows to long form.
Neighborhood_MedianListingPrice_AllHomes <- read.csv(
  "~/Documents/github/CityPredictions/data/Zillow/raw/Neighborhood_MedianListingPrice_AllHomes.csv"
)
dat_med_listing <- filter_zillow_data(Neighborhood_MedianListingPrice_AllHomes)

# One line per neighborhood. The colour legend is suppressed with "none";
# passing FALSE to guides() is deprecated in current ggplot2.
ggplot(data = dat_med_listing,
       aes(x = date, y = value, group = RegionName, col = RegionName)) +
  geom_line(alpha = 0.5) +
  guides(colour = "none") +
  coord_cartesian(ylim = c(200000, 2000000)) +
  ggtitle("Median Listing Prices in New York Neighborhoods") +
  labs(x = "Year", y = "Median Listing Price (All Homes)")
# Load the neighborhood median sold prices and reshape the New York rows
# to long form. The period columns of this file start at column 8.
Neighborhood_MedianSoldPrice_AllHomes <- read.csv(
  "~/Documents/github/CityPredictions/data/Zillow/raw/Neighborhood_MedianSoldPrice_AllHomes.csv"
)
dat_med_sold <- filter_zillow_data(Neighborhood_MedianSoldPrice_AllHomes,
                                   date_start = 8)

# One line per neighborhood. The colour legend is suppressed with "none";
# passing FALSE to guides() is deprecated in current ggplot2.
ggplot(data = dat_med_sold,
       aes(x = date, y = value, group = RegionName, col = RegionName)) +
  geom_line(alpha = 0.5) +
  guides(colour = "none") +
  coord_cartesian(ylim = c(200000, 2000000)) +
  ggtitle("Median selling Prices in New York Neighborhoods") +
  labs(x = "Year", y = "Median Sold Price (All Homes)")
This type of data is essentially identical - it would be interesting to see if there is a large difference between the two.
# Median over all neighborhoods for each date, one table per price series.
median_sell <- summarize(group_by(dat_med_sold, date),
                         Med_Sell = median(value, na.rm = TRUE))
median_list <- summarize(group_by(dat_med_listing, date),
                         Med_Listing = median(value, na.rm = TRUE))

# Keep only the dates present in both series, then add their gap.
# NOTE(review): this is Sold - Listing, while the per-area Difference
# computed further down is ListPrice - SoldPrice; confirm the intended sign.
medians <- merge(median_sell, median_list, by = "date")
medians[["Difference"]] <- medians[["Med_Sell"]] - medians[["Med_Listing"]]
medians <- melt(medians, id.vars = "date")

# The Difference series is excluded, so only the two price levels (and a
# smoother for each) are drawn.
ggplot(data = filter(medians, variable != "Difference"),
       aes(x = date, y = value, group = variable, col = variable)) +
  geom_line() +
  geom_smooth() +
  labs(x = "Year", y = "Price ($)") +
  ggtitle("Difference in Median Listing and Sale Price for All NY Areas")
Need to interpolate/predict missing values!
# Pair listing and sold prices for every neighborhood/date present in both
# tables; merge() suffixes the clashing columns with .x (listing) and
# .y (sold).
medians_all <- merge(dat_med_listing, dat_med_sold,
                     by = c("RegionName", "date"))
medians_all <- select(medians_all, RegionName, date,
                      ListPrice = value.x, SoldPrice = value.y)
medians_all$Difference <- medians_all$ListPrice - medians_all$SoldPrice

# One line per neighborhood. The colour legend is suppressed with "none";
# passing FALSE to guides() is deprecated in current ggplot2.
ggplot(data = medians_all,
       aes(x = date, y = Difference, group = RegionName, col = RegionName)) +
  geom_line(alpha = 0.6) +
  coord_cartesian(ylim = c(-500000, 1000000)) +
  guides(colour = "none") +
  labs(x = "Year", y = "List Price - Sold Price") +
  ggtitle("Difference in Median Listing and Sale Price By Areas")
# Read the 1-bedroom rental medians and reshape the New York rows
Neighborhood_MedianRentalPrice_1Bedroom <- read.csv(
  "~/Documents/github/CityPredictions/data/Zillow/raw/Neighborhood_MedianRentalPrice_1Bedroom.csv"
)
dat_med_rental_1br <- filter_zillow_data(Neighborhood_MedianRentalPrice_1Bedroom)

# One neighborhood has a very jagged series. Find it by ranking the
# neighborhoods on the standard deviation of their prices and printing
# the top row.
dat_med_rental_1br %>%
  group_by(RegionName) %>%
  summarize(SD = sd(value, na.rm = TRUE)) %>%
  arrange(desc(SD)) %>%
  data.frame() %>%
  .[1, ]
## RegionName SD
## 1 Murray Hill 871.2724
# Drop the neighborhood with the largest standard deviation so it does
# not dominate the plot.
dat_med_rental_1br <- filter(dat_med_rental_1br, RegionName != "Murray Hill")

# One line per neighborhood. The colour legend is suppressed with "none";
# passing FALSE to guides() is deprecated in current ggplot2.
ggplot(data = dat_med_rental_1br,
       aes(x = date, y = value, group = RegionName, col = RegionName)) +
  geom_line(alpha = 0.5) +
  guides(colour = "none") +
  ggtitle("Median Rental Prices in New York Neighborhoods (1BR)") +
  labs(x = "Year", y = "Median Rental Price (1 BR Homes)")
# Let's look at the period-over-period change in rental price.
# NOTE(review): the original comment said "over quarters"; the lag is one
# reporting period of the source file, so confirm the period length.
#
# The lag must be taken within each neighborhood and in date order.
# Lagging the whole melted column pairs each row with whatever row happens
# to precede it, which belongs to a different neighborhood wherever the
# rows cross a region boundary. The first period of every neighborhood is
# therefore NA. `value` is also spelled out in full instead of relying on
# `$val` partial matching.
dat_med_rental_1br <- dat_med_rental_1br %>%
  group_by(RegionName) %>%
  mutate(lagged_val = lag(value, order_by = date),
         detrend_val = value - lagged_val) %>%
  ungroup() %>%
  as.data.frame()

# One line per neighborhood. The colour legend is suppressed with "none";
# passing FALSE to guides() is deprecated in current ggplot2.
ggplot(data = dat_med_rental_1br,
       aes(x = date, y = detrend_val, group = RegionName, col = RegionName)) +
  geom_line(alpha = 0.5) +
  guides(colour = "none") +
  ggtitle("De-trended Median Rental Prices in New York Neighborhoods (1BR)") +
  labs(x = "Year", y = "De-trended Median Rental Price (1 BR Homes)")
# Load the 2-bedroom rental medians and reshape the New York rows.
Neighborhood_MedianRentalPrice_2Bedroom <- read.csv(
  "~/Documents/github/CityPredictions/data/Zillow/raw/Neighborhood_MedianRentalPrice_2Bedroom.csv"
)
dat_med_rental_2br <- filter_zillow_data(Neighborhood_MedianRentalPrice_2Bedroom,
                                         date_start = 7)
# NOTE(review): Murray Hill was identified as an outlier from the 1BR
# series only; confirm it should be excluded here as well.
dat_med_rental_2br <- filter(dat_med_rental_2br, RegionName != "Murray Hill")

# One line per neighborhood. The colour legend is suppressed with "none";
# passing FALSE to guides() is deprecated in current ggplot2.
ggplot(data = dat_med_rental_2br,
       aes(x = date, y = value, group = RegionName, col = RegionName)) +
  geom_line(alpha = 0.5) +
  guides(colour = "none") +
  ggtitle("Median Rental Prices in New York Neighborhoods (2BR)") +
  labs(x = "Year", y = "Median Rental Price (2 BR Homes)")
# Load the 3-bedroom rental medians and reshape the New York rows.
Neighborhood_MedianRentalPrice_3Bedroom <- read.csv(
  "~/Documents/github/CityPredictions/data/Zillow/raw/Neighborhood_MedianRentalPrice_3Bedroom.csv"
)
dat_med_rental_3br <- filter_zillow_data(Neighborhood_MedianRentalPrice_3Bedroom,
                                         date_start = 7)
# NOTE(review): Murray Hill was identified as an outlier from the 1BR
# series only; confirm it should be excluded here as well.
dat_med_rental_3br <- filter(dat_med_rental_3br, RegionName != "Murray Hill")

# One line per neighborhood. The colour legend is suppressed with "none";
# passing FALSE to guides() is deprecated in current ggplot2.
ggplot(data = dat_med_rental_3br,
       aes(x = date, y = value, group = RegionName, col = RegionName)) +
  geom_line(alpha = 0.5) +
  guides(colour = "none") +
  ggtitle("Median Rental Prices in New York Neighborhoods (3BR)") +
  labs(x = "Year", y = "Median Rental Price (3 BR Homes)")
# Load the 4-bedroom rental medians and reshape the New York rows.
Neighborhood_MedianRentalPrice_4Bedroom <- read.csv(
  "~/Documents/github/CityPredictions/data/Zillow/raw/Neighborhood_MedianRentalPrice_4Bedroom.csv"
)
dat_med_rental_4br <- filter_zillow_data(
  Neighborhood_MedianRentalPrice_4Bedroom,
  date_start = 7
)
# NOTE(review): Murray Hill was identified as an outlier from the 1BR
# series only; confirm it should be excluded here as well.
dat_med_rental_4br <- dat_med_rental_4br %>%
  filter(RegionName != "Murray Hill")

# One line per neighborhood. Unlike the other rental plots, the colour
# legend is left visible here (the guides() call is disabled).
ggplot(
  data = dat_med_rental_4br,
  mapping = aes(x = date, y = value, group = RegionName, colour = RegionName)
) +
  geom_line(alpha = 0.5) +
  labs(
    title = "Median Rental Prices in New York Neighborhoods (4BR)",
    x = "Year",
    y = "Median Rental Price (4 BR Homes)"
  )
# Load the all-homes rental medians and reshape the New York rows.
Neighborhood_MedianRentalPrice_AllHomes <- read.csv(
  "~/Documents/github/CityPredictions/data/Zillow/raw/Neighborhood_MedianRentalPrice_AllHomes.csv"
)
dat_med_rental_AllHomes <- filter_zillow_data(Neighborhood_MedianRentalPrice_AllHomes,
                                              date_start = 7)
# NOTE(review): Murray Hill was identified as an outlier from the 1BR
# series only; confirm it should be excluded here as well.
dat_med_rental_AllHomes <- filter(dat_med_rental_AllHomes,
                                  RegionName != "Murray Hill")

# One line per neighborhood. The colour legend is suppressed with "none";
# passing FALSE to guides() is deprecated in current ggplot2.
ggplot(data = dat_med_rental_AllHomes,
       aes(x = date, y = value, group = RegionName, col = RegionName)) +
  geom_line(alpha = 0.5) +
  guides(colour = "none") +
  ggtitle("Median Rental Prices in New York Neighborhoods (All)") +
  labs(x = "Year", y = "Median Rental Price (All Homes)")